import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import os
import matplotlib.pyplot as plt
import scvelo as scv
from pybiomart import Server
from tqdm.notebook import tqdm
import itertools
import plotly.express as px
import math
import random
warnings.filterwarnings('ignore')
# In[2]:
get_ipython().run_line_magic('matplotlib', 'inline')
# In[3]:
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (4, 4)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
hostRoot = "-".join(socket.gethostname().split('-')[0:2])
#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir"
FinaLeaf = "/Progenitors"
externalDataNowaraw = "./data/Badhuri_atlas.h5ad"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
adataPath = outdir+"/BadhuriCurated.h5ad"
badhuriMarkers = ["FOS","DDIT3","CEBPD","HOPX","SUB1","HES4","NR2F1","NFIA","FOXO3","FOXP1","NFIB","TCF4","NFIX","HMGB2","SOX11","ZBTB18","BCL11A","ZNF462","ZIC2"]
Radial_cluster_label_aggr = ["radial"]
Radial_geschwindIngested = ["vRG"]
individualsToRemove = ["GW20"]
adata = sc.read_h5ad(adataPath)
adata.obs.columns
Index(['development_stage_ontology_term_id', 'individual', 'brain_region',
'cortical_area', 'lamina', 'cluster_label', 'ngene', 'numi',
'percent_mito', 'percent_ribo', 'ncount_rna', 'nfeature_rna',
'tissue_ontology_term_id', 'assay_ontology_term_id',
'disease_ontology_term_id', 'cell_type_ontology_term_id',
'ethnicity_ontology_term_id', 'is_primary_data',
'organism_ontology_term_id', 'sex_ontology_term_id', 'cell_type',
'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity',
'development_stage', 'S_score', 'G2M_score', 'phase',
'cluster_label_aggr', 'batch', 'cluster_label_area_age',
'cluster_label_area', 'GeschIngestedAnno', 'cluster_label_area_layers'],
dtype='object')
#Select dividing
adata = adata[(adata.obs["GeschIngestedAnno"].isin(Radial_geschwindIngested)) & (adata.obs["cluster_label_aggr"].isin(Radial_cluster_label_aggr))]
adata = adata[~adata.obs["individual"].isin(individualsToRemove)]
adata
View of AnnData object with n_obs × n_vars = 9106 × 1201
obs: 'development_stage_ontology_term_id', 'individual', 'brain_region', 'cortical_area', 'lamina', 'cluster_label', 'ngene', 'numi', 'percent_mito', 'percent_ribo', 'ncount_rna', 'nfeature_rna', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'S_score', 'G2M_score', 'phase', 'cluster_label_aggr', 'batch', 'cluster_label_area_age', 'cluster_label_area', 'GeschIngestedAnno', 'cluster_label_area_layers'
var: 'feature_name', 'feature_reference', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'GeschIngestedAnno_colors', 'X_normalization', 'batch_colors', 'cell_type_colors', 'cluster_label_aggr_colors', 'cluster_label_area_colors', 'cluster_label_area_layers_colors', 'cortical_area_colors', 'hvg', 'individual_colors', 'neighbors', 'pca', 'phase_colors', 'schema_version', 'tissue_colors', 'title', 'umap'
obsm: 'X_fastpca', 'X_pca', 'X_umap', 'pca_harmony'
varm: 'PCs'
layers: 'norm_nolog'
obsp: 'connectivities', 'distances'
adata.obs[["GeschIngestedAnno","cluster_label_aggr"]]
| GeschIngestedAnno | cluster_label_aggr | |
|---|---|---|
| gw19_2_PFC_AGTTCGATCGACCCAG | vRG | radial |
| gw19_2_PFC_GCTTGGGCAATTTCTC | vRG | radial |
| gw19_2_Motor_AACCACAAGAGCTGCA | vRG | radial |
| gw19_2_Motor_ACGGAAGTCCACGTGG | vRG | radial |
| gw19_2_Motor_ATCGCCTGTACAGTCT | vRG | radial |
| ... | ... | ... |
| GW16_V1_GGTTCTCTCATCGCAA | vRG | radial |
| GW16_V1_GGTTGTACAACCGCTG | vRG | radial |
| GW16_V1_TAAGTCGAGTTGGAGC | vRG | radial |
| GW16_V1_TCAAGACGTTGCTCCT | vRG | radial |
| GW16_V1_TTGCGTCCATGAGGGT | vRG | radial |
9106 rows × 2 columns
adata = adata.raw.to_adata()
adata.raw = adata.copy()
sc.pp.scale(adata, max_value=20)
sc.tl.pca(adata, svd_solver='arpack')
import scanpy.external as sce
sce.pp.harmony_integrate(adata,key="batch", max_iter_harmony = 20, adjusted_basis = "pca_harmony")
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:04)
2023-04-26 19:22:27,935 - harmonypy - INFO - Iteration 1 of 20 2023-04-26 19:22:32,344 - harmonypy - INFO - Iteration 2 of 20 2023-04-26 19:22:41,196 - harmonypy - INFO - Converged after 2 iterations
sc.pp.neighbors(adata, n_neighbors= 100, n_pcs=50, use_rep="pca_harmony")
sc.tl.umap(adata)
sc.tl.diffmap(adata)
computing neighbors
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:29)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:00)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
finished (0:00:00)
eigenvalues of transition matrix
[1. 0.93052393 0.90235215 0.89221084 0.88726026 0.8771377
0.86251444 0.8563181 0.8271251 0.81704575 0.8101579 0.8002406
0.7958678 0.79323226 0.78669226]
finished: added
'X_diffmap', diffmap coordinates (adata.obsm)
'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
sc.pl.diffmap(adata, size = 100,color=["batch","cluster_label_aggr","GeschIngestedAnno","individual","TOP2A"], components=["1,3","1,2","3,4","2,3"], ncols=2)
sc.tl.leiden(adata, resolution=.5, key_added="leiden")
running Leiden clustering
finished: found 6 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:13)
sc.pl.umap(adata, size = 30,color=["batch","cortical_area","cluster_label_aggr","GeschIngestedAnno","individual","TOP2A","leiden","DCX","HOPX","S100B","phase"], ncols=2, vmin="p1",vmax="p99")